import pandas as pd
import pandas as pd
# Load the dataset
file_path = '/content/insurance.csv'
data = pd.read_csv(file_path)
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())
# Display the total number of records
print("\nTotal number of records:")
print(data.shape[0])
# Display the column headers
print("\nColumn headers:")
print(data.columns.tolist())
First few rows of the dataset: age sex bmi children smoker region charges 0 19 female 27.900 0 yes southwest 16884.92400 1 18 male 33.770 1 no southeast 1725.55230 2 28 male 33.000 3 no southeast 4449.46200 3 33 male 22.705 0 no northwest 21984.47061 4 32 male 28.880 0 no northwest 3866.85520 Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
Data Preprocessing
# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())
# Encode categorical variables
data_encoded = pd.get_dummies(data, columns=['sex', 'smoker', 'region'], drop_first=True)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_features = ['age', 'bmi', 'children', 'charges']
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])
# Check the preprocessed data
print("Preprocessed data:")
print(data_encoded.head())
Missing values in each column:
age 0
sex 0
bmi 0
children 0
smoker 0
region 0
charges 0
dtype: int64
Preprocessed data:
age bmi children charges sex_male smoker_yes \
0 -1.438764 -0.453320 -0.908614 0.298584 False True
1 -1.509965 0.509621 -0.078767 -0.953689 True False
2 -0.797954 0.383307 1.580926 -0.728675 True False
3 -0.441948 -1.305531 -0.908614 0.719843 True False
4 -0.513149 -0.292556 -0.908614 -0.776802 True False
region_northwest region_southeast region_southwest
0 False False True
1 False True False
2 False True False
3 True False False
4 True False False
Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns
# Plot distribution of charges
sns.histplot(data['charges'], kde=True)
plt.title('Distribution of Medical Charges')
plt.show()
# Plot pairwise relationships
sns.pairplot(data_encoded, diag_kind='kde')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Split the data
X = data_encoded.drop('charges', axis=1)
y = data_encoded['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train the model
model = RandomForestRegressor()
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")
MAE: 0.21439914894316878 RMSE: 0.3801706977926863 R²: 0.8555506171102616
Model Selection and Training Steps
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Display the first few rows of the dataset
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Check the column names to ensure they match
print("Column names:", df.columns.tolist())
Column names: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())
Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Output the shape of the training and test sets to confirm
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
Training set shape: (936, 8) Testing set shape: (402, 8)
# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
rf_predictions = rf_model.predict(X_test)
4.2 Gradient Boosting Machines (GBM)
# Initialize and train the Gradient Boosting model
gbm_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)
# Make predictions
gbm_predictions = gbm_model.predict(X_test)
4.3 XGBoost
# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
# Make predictions
xgb_predictions = xgb_model.predict(X_test)
# Function to evaluate the model
def evaluate_model(y_true, y_pred):
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
return mae, rmse, r2
# Evaluate Random Forest
rf_mae, rf_rmse, rf_r2 = evaluate_model(y_test, rf_predictions)
print(f"Random Forest - MAE: {rf_mae}, RMSE: {rf_rmse}, R²: {rf_r2}")
# Evaluate Gradient Boosting Machines
gbm_mae, gbm_rmse, gbm_r2 = evaluate_model(y_test, gbm_predictions)
print(f"Gradient Boosting - MAE: {gbm_mae}, RMSE: {gbm_rmse}, R²: {gbm_r2}")
# Evaluate XGBoost
xgb_mae, xgb_rmse, xgb_r2 = evaluate_model(y_test, xgb_predictions)
print(f"XGBoost - MAE: {xgb_mae}, RMSE: {xgb_rmse}, R²: {xgb_r2}")
Random Forest - MAE: 2667.1494970211447, RMSE: 4657.026790942916, R²: 0.8520852453249403 Gradient Boosting - MAE: 2488.997678672316, RMSE: 4435.723953515011, R²: 0.8658091057725971 XGBoost - MAE: 2815.9741813944343, RMSE: 4908.251089992976, R²: 0.8356962149835843
# Compare the performance of models
comparison_df = pd.DataFrame({
'Model': ['Random Forest', 'Gradient Boosting', 'XGBoost'],
'MAE': [rf_mae, gbm_mae, xgb_mae],
'RMSE': [rf_rmse, gbm_rmse, xgb_rmse],
'R²': [rf_r2, gbm_r2, xgb_r2]
})
print(comparison_df)
Model MAE RMSE R² 0 Random Forest 2667.149497 4657.026791 0.852085 1 Gradient Boosting 2488.997679 4435.723954 0.865809 2 XGBoost 2815.974181 4908.251090 0.835696
from sklearn.model_selection import GridSearchCV
# Define parameter grid for hyperparameter tuning
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [10, 20, 30],
'min_samples_split': [2, 5, 10]
}
# Perform Grid Search on the best-performing model (e.g., Random Forest)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
# Get the best parameters and retrain the model
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
import joblib
# Save the best model
joblib.dump(best_rf_model, 'best_rf_model.pkl')
['best_rf_model.pkl']
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Define the model
rf = RandomForestRegressor(random_state=42)
# Define the parameter grid
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# Implement GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
cv=5, n_jobs=-1, verbose=2)
# Fit the model
grid_search.fit(X_train, y_train)
# Best parameters and model
print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
import joblib
# Save the model to a file
joblib.dump(best_rf, 'best_random_forest_model.pkl')
# Load the model (for deployment)
loaded_model = joblib.load('best_random_forest_model.pkl')
# Make predictions using the loaded model
y_pred = loaded_model.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Predictions
y_pred = loaded_model.predict(X_test)
# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-Squared: {r2}")
MAE: 2498.701840797714 RMSE: 4358.930672423174 R-Squared: 0.8704152341458199
from flask import Flask, request, jsonify
import joblib
import numpy as np
app = Flask(__name__)
# Load the model
model = joblib.load('best_random_forest_model.pkl')
@app.route('/predict', methods=['POST'])
def predict():
data = request.get_json(force=True)
prediction = model.predict(np.array([data['features']]))
return jsonify({'prediction': prediction[0]})
if __name__ == '__main__':
app.run(debug=True)
* Serving Flask app '__main__' * Debug mode: on
INFO:werkzeug:WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. * Running on http://127.0.0.1:5000 INFO:werkzeug:Press CTRL+C to quit INFO:werkzeug: * Restarting with stat
import logging
# Configure logging
logging.basicConfig(filename='model_performance.log', level=logging.INFO)
# Log performance metrics
logging.info(f"MAE: {mae}")
logging.info(f"RMSE: {rmse}")
logging.info(f"R-Squared: {r2}")
fullcode
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load and explore the dataset
df = pd.read_csv('/content/insurance.csv')
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())
# Display dataset info
print("\nDataset info:")
print(df.info())
# Show column headers and the number of records
print(f"\nTotal number of records: {df.shape[0]}")
print(f"Column headers: {df.columns.tolist()}")
# Data Preprocessing
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Model Training
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)
print(f"\nRandom Forest - MAE: {rf_mae}, RMSE: {rf_rmse}, R2: {rf_r2}")
# Gradient Boosting Machine (GBM)
gbm = GradientBoostingRegressor(random_state=42)
gbm.fit(X_train, y_train)
gbm_pred = gbm.predict(X_test)
gbm_mae = mean_absolute_error(y_test, gbm_pred)
gbm_rmse = np.sqrt(mean_squared_error(y_test, gbm_pred))
gbm_r2 = r2_score(y_test, gbm_pred)
print(f"\nGBM - MAE: {gbm_mae}, RMSE: {gbm_rmse}, R2: {gbm_r2}")
# XGBoost
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_r2 = r2_score(y_test, xgb_pred)
print(f"\nXGBoost - MAE: {xgb_mae}, RMSE: {xgb_rmse}, R2: {xgb_r2}")
# Hyperparameter Tuning for Random Forest
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)
# Best parameters and model evaluation
print("\nBest Parameters for Random Forest:", grid_search_rf.best_params_)
best_rf = grid_search_rf.best_estimator_
best_rf_pred = best_rf.predict(X_test)
best_rf_mae = mean_absolute_error(y_test, best_rf_pred)
best_rf_rmse = np.sqrt(mean_squared_error(y_test, best_rf_pred))
best_rf_r2 = r2_score(y_test, best_rf_pred)
print(f"\nBest Random Forest - MAE: {best_rf_mae}, RMSE: {best_rf_rmse}, R2: {best_rf_r2}")
# Save the Best Model
joblib.dump(best_rf, 'best_random_forest_model.pkl')
print("\nBest model saved as 'best_random_forest_model.pkl'")
# Load the Model (for deployment)
loaded_model = joblib.load('best_random_forest_model.pkl')
# Evaluate the Loaded Model
loaded_model_pred = loaded_model.predict(X_test)
loaded_mae = mean_absolute_error(y_test, loaded_model_pred)
loaded_rmse = np.sqrt(mean_squared_error(y_test, loaded_model_pred))
loaded_r2 = r2_score(y_test, loaded_model_pred)
print(f"\nLoaded Model - MAE: {loaded_mae}, RMSE: {loaded_rmse}, R2: {loaded_r2}")
First few rows of the dataset:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
Missing values in each column:
age 0
sex 0
bmi 0
children 0
smoker 0
region 0
charges 0
dtype: int64
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1338 non-null int64
1 sex 1338 non-null object
2 bmi 1338 non-null float64
3 children 1338 non-null int64
4 smoker 1338 non-null object
5 region 1338 non-null object
6 charges 1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
Total number of records: 1338
Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
Random Forest - MAE: 2667.1494970211447, RMSE: 4657.026790942916, R2: 0.8520852453249403
GBM - MAE: 2488.997678672316, RMSE: 4435.723953515011, R2: 0.8658091057725971
XGBoost - MAE: 2815.9741813944343, RMSE: 4908.251089992976, R2: 0.8356962149835843
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best Random Forest - MAE: 2498.701840797714, RMSE: 4358.930672423174, R2: 0.8704152341458199
Best model saved as 'best_random_forest_model.pkl'
Loaded Model - MAE: 2498.701840797714, RMSE: 4358.930672423174, R2: 0.8704152341458199
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Display basic information about the dataset
print(f"Total number of records: {len(df)}")
print(f"Column headers: {list(df.columns)}")
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Random Forest Model - MAE: {mae:.2f}")
print(f"Random Forest Model - RMSE: {rmse:.2f}")
print(f"Random Forest Model - R²: {r2:.2f}")
# Cross-validation for Random Forest
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
mean_cv_score = -cv_scores.mean()
print(f"Mean Cross-Validated MAE: {mean_cv_score:.2f}")
# Feature importance analysis
importances = rf_model.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance')
plt.show()
# Hyperparameter tuning with GridSearchCV for Random Forest
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated MAE: {-grid_search.best_score_:.2f}")
# Final evaluation with the best parameters
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)
print(f"Best Random Forest Model - MAE: {mae_best:.2f}")
print(f"Best Random Forest Model - RMSE: {rmse_best:.2f}")
print(f"Best Random Forest Model - R²: {r2_best:.2f}")
# Visualize Actual vs. Predicted Charges
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges')
plt.show()
# Save the final model
import joblib
joblib.dump(best_rf_model, 'final_rf_model.pkl')
# Load the model for future predictions (if needed)
# loaded_model = joblib.load('final_rf_model.pkl')
# Example of using the loaded model:
# sample_prediction = loaded_model.predict([X_test.iloc[0]])
# print(f"Sample Prediction: {sample_prediction[0]:.2f}")
Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64 Random Forest Model - MAE: 2667.15 Random Forest Model - RMSE: 4657.03 Random Forest Model - R²: 0.85 Mean Cross-Validated MAE: 2710.21
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validated MAE: 2853.75
Best Random Forest Model - MAE: 2606.56
Best Random Forest Model - RMSE: 4468.94
Best Random Forest Model - R²: 0.86
['final_rf_model.pkl']
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Display basic information about the dataset
print(f"Total number of records: {len(df)}")
print(f"Column headers: {list(df.columns)}")
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# Visualize the distribution of the target variable 'charges'
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], bins=30, kde=True)
plt.title('Distribution of Charges')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.show()
# Visualize relationships between features and the target variable
sns.pairplot(df)
plt.suptitle('Pairplot of Features and Target Variable', y=1.02)
plt.show()
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Random Forest Model - MAE: {mae:.2f}")
print(f"Random Forest Model - RMSE: {rmse:.2f}")
print(f"Random Forest Model - R²: {r2:.2f}")
# Cross-validation for Random Forest
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
mean_cv_score = -cv_scores.mean()
print(f"Mean Cross-Validated MAE: {mean_cv_score:.2f}")
# Feature importance analysis
importances = rf_model.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance')
plt.show()
# Hyperparameter tuning with GridSearchCV for Random Forest
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated MAE: {-grid_search.best_score_:.2f}")
# Final evaluation with the best parameters
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)
print(f"Best Random Forest Model - MAE: {mae_best:.2f}")
print(f"Best Random Forest Model - RMSE: {rmse_best:.2f}")
print(f"Best Random Forest Model - R²: {r2_best:.2f}")
# Visualize Actual vs. Predicted Charges
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges')
plt.show()
# Residuals Plot
residuals = y_test - y_pred_best
plt.figure(figsize=(10, 6))
sns.histplot(residuals, bins=30, kde=True)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()
# Save the final model
joblib.dump(best_rf_model, 'final_rf_model.pkl')
# Load the model for future predictions (if needed)
# loaded_model = joblib.load('final_rf_model.pkl')
# Example of using the loaded model:
# sample_prediction = loaded_model.predict([X_test.iloc[0]])
# print(f"Sample Prediction: {sample_prediction[0]:.2f}")
Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
Random Forest Model - MAE: 2667.15 Random Forest Model - RMSE: 4657.03 Random Forest Model - R²: 0.85 Mean Cross-Validated MAE: 2710.21
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validated MAE: 2853.75
Best Random Forest Model - MAE: 2606.56
Best Random Forest Model - RMSE: 4468.94
Best Random Forest Model - R²: 0.86
['final_rf_model.pkl']
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Data exploration
print(f"Total number of records: {len(df)}")
print(f"Column headers: {list(df.columns)}")
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# Visualize the distribution of the target variable 'charges'
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], bins=30, kde=True)
plt.title('Distribution of Charges')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.savefig('distribution_charges.png') # Save figure for documentation
plt.show()
# Visualize relationships between features and the target variable
sns.pairplot(df)
plt.suptitle('Pairplot of Features and Target Variable', y=1.02)
plt.savefig('pairplot_features_target.png') # Save figure for documentation
plt.show()
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize and train a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions
y_pred = rf_model.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Random Forest Model - MAE: {mae:.2f}")
print(f"Random Forest Model - RMSE: {rmse:.2f}")
print(f"Random Forest Model - R²: {r2:.2f}")
# Cross-validation for Random Forest
cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
mean_cv_score = -cv_scores.mean()
print(f"Mean Cross-Validated MAE: {mean_cv_score:.2f}")
# Feature importance analysis
importances = rf_model.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False)
# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance')
plt.savefig('feature_importance.png') # Save figure for documentation
plt.show()
# Hyperparameter tuning with GridSearchCV for Random Forest
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best Cross-Validated MAE: {-grid_search.best_score_:.2f}")
# Final evaluation with the best parameters
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)
print(f"Best Random Forest Model - MAE: {mae_best:.2f}")
print(f"Best Random Forest Model - RMSE: {rmse_best:.2f}")
print(f"Best Random Forest Model - R²: {r2_best:.2f}")
# Visualize Actual vs. Predicted Charges
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--', color='red')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges')
plt.savefig('actual_vs_predicted.png') # Save figure for documentation
plt.show()
# Residuals Plot
residuals = y_test - y_pred_best
plt.figure(figsize=(10, 6))
sns.histplot(residuals, bins=30, kde=True)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('residuals_distribution.png') # Save figure for documentation
plt.show()
# Save the final model
joblib.dump(best_rf_model, 'final_rf_model.pkl')
# Load the model for future predictions (if needed)
# loaded_model = joblib.load('final_rf_model.pkl')
# Example of using the loaded model:
# sample_prediction = loaded_model.predict([X_test.iloc[0]])
# print(f"Sample Prediction: {sample_prediction[0]:.2f}")
# Documentation (for record-keeping)
documentation = """
Project Title: Healthcare Cost Prediction Using Machine Learning
Data:
- Dataset: insurance.csv
- Total records: 1338
- Columns: age, sex, bmi, children, smoker, region, charges
Preprocessing:
- Categorical variables encoded using one-hot encoding.
- Features and target variable defined.
- Data split into training and testing sets.
Models Trained:
1. Random Forest Regressor
2. Gradient Boosting Regressor (not shown in this script but would follow a similar pattern)
3. XGBoost Regressor (not shown in this script but would follow a similar pattern)
Evaluation Metrics:
- Mean Absolute Error (MAE)
- Root Mean Squared Error (RMSE)
- R-Squared (R²)
Best Model:
- Random Forest Regressor with tuned parameters
- MAE: {:.2f}
- RMSE: {:.2f}
- R²: {:.2f}
Visualizations:
- Distribution of Charges
- Pairplot of Features and Target Variable
- Feature Importance
- Actual vs. Predicted Charges
- Residuals Distribution
Model:
- Final Random Forest model saved as 'final_rf_model.pkl'
Future Work:
- Further refinement of models.
- Deployment in a production environment.
- Preparation of presentation materials for stakeholders.
""".format(mae_best, rmse_best, r2_best)
# Save documentation to a text file
with open('documentation.txt', 'w') as file:
file.write(documentation)
print("Documentation saved as 'documentation.txt'")
Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
Random Forest Model - MAE: 2667.15 Random Forest Model - RMSE: 4657.03 Random Forest Model - R²: 0.85 Mean Cross-Validated MAE: 2710.21
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validated MAE: 2853.75
Best Random Forest Model - MAE: 2606.56
Best Random Forest Model - RMSE: 4468.94
Best Random Forest Model - R²: 0.86
Documentation saved as 'documentation.txt'
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Data Exploration
print(f"Total number of records: {len(df)}")
print(f"Column headers: {list(df.columns)}")
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# Basic statistics
print("Basic statistics of the dataset:\n", df.describe())
# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Data Visualization
# Distribution of the target variable 'charges'
plt.figure(figsize=(10, 6))
sns.histplot(df_encoded['charges'], bins=30, kde=True)
plt.title('Distribution of Charges')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.savefig('distribution_charges.png')
plt.show()
# Pairplot to visualize relationships between features
sns.pairplot(df_encoded, hue='smoker_yes')
plt.suptitle('Pairplot of Features and Target Variable', y=1.02)
plt.savefig('pairplot_features_target.png')
plt.show()
# Correlation matrix (only with numeric columns)
numeric_columns = df_encoded.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(12, 8))
correlation_matrix = df_encoded[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.show()
# Boxplots for features by 'smoker' status
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
sns.boxplot(x='smoker_yes', y='age', data=df_encoded)
plt.title('Age by Smoker Status')
plt.subplot(2, 2, 2)
sns.boxplot(x='smoker_yes', y='bmi', data=df_encoded)
plt.title('BMI by Smoker Status')
plt.subplot(2, 2, 3)
sns.boxplot(x='smoker_yes', y='children', data=df_encoded)
plt.title('Children by Smoker Status')
plt.subplot(2, 2, 4)
sns.boxplot(x='smoker_yes', y='charges', data=df_encoded)
plt.title('Charges by Smoker Status')
plt.tight_layout()
plt.savefig('boxplots_smoker_status.png')
plt.show()
# Variation Analysis
# Scaling features for better comparison
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_encoded[['age', 'bmi', 'children', 'charges']])
scaled_df = pd.DataFrame(scaled_features, columns=['age', 'bmi', 'children', 'charges'])
# Histograms of scaled features
plt.figure(figsize=(12, 8))
for i, column in enumerate(scaled_df.columns):
plt.subplot(2, 2, i+1)
sns.histplot(scaled_df[column], bins=30, kde=True)
plt.title(f'Scaled Distribution of {column}')
plt.tight_layout()
plt.savefig('scaled_features_distribution.png')
plt.show()
# Comparative Analysis
# T-test between charges for smokers and non-smokers
smokers = df_encoded[df_encoded['smoker_yes'] == 1]['charges']
non_smokers = df_encoded[df_encoded['smoker_yes'] == 0]['charges']
t_stat, p_val = stats.ttest_ind(smokers, non_smokers)
print(f"T-test between charges for smokers and non-smokers:\nT-statistic: {t_stat:.2f}\nP-value: {p_val:.2f}")
# Analysis Summary
summary = f"""
Analysis Summary:
1. Data Exploration:
- Total records: {len(df)}
- Columns: {list(df.columns)}
- Missing values: {df.isnull().sum()}
- Basic statistics: {df.describe()}
2. Data Visualization:
- Distribution of Charges: Saved as 'distribution_charges.png'
- Pairplot of Features: Saved as 'pairplot_features_target.png'
- Correlation Matrix: Saved as 'correlation_matrix.png'
- Boxplots by Smoker Status: Saved as 'boxplots_smoker_status.png'
3. Variation Analysis:
- Scaled distributions of features: Saved as 'scaled_features_distribution.png'
4. Comparative Analysis:
- T-test results between charges for smokers and non-smokers:
T-statistic: {t_stat:.2f}
P-value: {p_val:.2f}
"""
# Save the summary to a text file
with open('analysis_summary.txt', 'w') as file:
file.write(summary)
print("Analysis summary saved as 'analysis_summary.txt'")
Total number of records: 1338
Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
Missing values in each column:
age 0
sex 0
bmi 0
children 0
smoker 0
region 0
charges 0
dtype: int64
Basic statistics of the dataset:
age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010
T-test between charges for smokers and non-smokers: T-statistic: 46.66 P-value: 0.00 Analysis summary saved as 'analysis_summary.txt'
Full Code for Healthcare Cost Prediction Project
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load the dataset
df = pd.read_csv('/content/insurance.csv')
# Display dataset information
print("Dataset Overview:")
print(f"Total number of records: {df.shape[0]}")
print("Column headers:")
print(df.columns.tolist())
# Data Cleaning and Encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features and target variable
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Model Definitions
models = {
'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}
# Train and Evaluate Models
results = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
results[name] = {
'MAE': mean_absolute_error(y_test, y_pred),
'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
'R²': r2_score(y_test, y_pred)
}
# Save the model
joblib.dump(model, f'{name}_model.pkl')
# Print evaluation results
print("\nModel Evaluation Results:")
for name, metrics in results.items():
print(f"\n{name}:")
for metric, value in metrics.items():
print(f"{metric}: {value:.2f}")
# Data Visualization
plt.figure(figsize=(12, 8))
sns.histplot(df['charges'], bins=30, kde=True)
plt.title('Distribution of Medical Charges')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()
plt.figure(figsize=(12, 8))
sns.pairplot(df, vars=['age', 'bmi', 'children', 'charges'], hue='smoker_yes')
plt.title('Pairplot of Features and Charges')
plt.show()
# Next Steps
print("\nNext Steps:")
print("1. Refine Models: Tune hyperparameters and explore feature engineering.")
print("2. Deploy Model: Save the final model and prepare it for deployment.")
print("3. Documentation: Document the process, findings, and code.")
print("4. Presentation: Prepare visualizations and summaries for stakeholders.")
print("5. Further Model Refinement: Explore other models and continuous improvement.")
# Example of loading and using the saved model
loaded_model = joblib.load('Random Forest_model.pkl')
sample_prediction = loaded_model.predict(X_test.head(1))
print("\nSample Prediction using Random Forest Model:")
print(sample_prediction)
Dataset Overview: Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] Model Evaluation Results: Random Forest: MAE: 2667.15 RMSE: 4657.03 R²: 0.85 Gradient Boosting: MAE: 2489.00 RMSE: 4435.72 R²: 0.87 XGBoost: MAE: 2815.97 RMSE: 4908.25 R²: 0.84
<Figure size 1200x800 with 0 Axes>
Next Steps: 1. Refine Models: Tune hyperparameters and explore feature engineering. 2. Deploy Model: Save the final model and prepare it for deployment. 3. Documentation: Document the process, findings, and code. 4. Presentation: Prepare visualizations and summaries for stakeholders. 5. Further Model Refinement: Explore other models and continuous improvement. Sample Prediction using Random Forest Model: [9161.91545]
Full Code Implementation
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
file_path = '/content/insurance.csv'
df = pd.read_csv(file_path)
# Data Preprocessing
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Define models
models = {
'RandomForest': RandomForestRegressor(),
'GradientBoosting': GradientBoostingRegressor(),
'XGBoost': XGBRegressor()
}
# Train and validate models
results = {}
for model_name, model in models.items():
print(f"Training {model_name}...")
# Train the model
model.fit(X_train, y_train)
# Predict on validation set
y_val_pred = model.predict(X_val)
# Calculate metrics
mae = mean_absolute_error(y_val, y_val_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2 = r2_score(y_val, y_val_pred)
results[model_name] = {
'MAE': mae,
'RMSE': rmse,
'R2': r2
}
print(f"{model_name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
# Evaluate the best model on the test set
best_model_name = min(results, key=lambda x: results[x]['MAE'])
best_model = models[best_model_name]
# Train on the full training data
best_model.fit(X_train, y_train)
# Predict on the test set
y_test_pred = best_model.predict(X_test)
# Calculate final metrics
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)
print(f"\nBest Model: {best_model_name}")
print(f"Test Set - MAE: {mae_test:.4f}, RMSE: {rmse_test:.4f}, R2: {r2_test:.4f}")
# Plot feature importance for the best model
if best_model_name in ['RandomForest', 'GradientBoosting']:
importances = best_model.feature_importances_
elif best_model_name == 'XGBoost':
importances = best_model.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 8))
plt.title(f"Feature Importances - {best_model_name}")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), features[indices], rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()
Training RandomForest... RandomForest - MAE: 2692.9208, RMSE: 4813.7713, R2: 0.8355 Training GradientBoosting... GradientBoosting - MAE: 2519.5803, RMSE: 4488.7612, R2: 0.8570 Training XGBoost... XGBoost - MAE: 2843.4263, RMSE: 5036.5911, R2: 0.8199 Best Model: GradientBoosting Test Set - MAE: 2701.7032, RMSE: 4740.1324, R2: 0.8551
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
file_path = '/content/insurance.csv'
df = pd.read_csv(file_path)
# Data Preprocessing
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Define models
models = {
'RandomForest': RandomForestRegressor(),
'GradientBoosting': GradientBoostingRegressor(),
'XGBoost': XGBRegressor()
}
# Initialize lists to store metrics
validation_mae = []
validation_rmse = []
validation_r2 = []
test_mae = []
test_rmse = []
test_r2 = []
# Train and validate models
for model_name, model in models.items():
print(f"Training {model_name}...")
# Train the model
model.fit(X_train, y_train)
# Predict on validation set
y_val_pred = model.predict(X_val)
# Calculate validation metrics
mae_val = mean_absolute_error(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2_val = r2_score(y_val, y_val_pred)
# Store validation results
validation_mae.append(mae_val)
validation_rmse.append(rmse_val)
validation_r2.append(r2_val)
# Predict on test set
y_test_pred = model.predict(X_test)
# Calculate test metrics
mae_test = mean_absolute_error(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)
# Store test results
test_mae.append(mae_test)
test_rmse.append(rmse_test)
test_r2.append(r2_test)
print(f"{model_name} - Validation MAE: {mae_val:.4f}, RMSE: {rmse_val:.4f}, R2: {r2_val:.4f}")
print(f"{model_name} - Test MAE: {mae_test:.4f}, RMSE: {rmse_test:.4f}, R2: {r2_test:.4f}")
# Plot accuracy graphs
plt.figure(figsize=(18, 6))
# MAE Plot
plt.subplot(1, 3, 1)
sns.barplot(x=list(models.keys()), y=validation_mae, color='blue', alpha=0.6, label='Validation MAE')
sns.barplot(x=list(models.keys()), y=test_mae, color='red', alpha=0.6, label='Test MAE')
plt.title('Mean Absolute Error (MAE)')
plt.xlabel('Model')
plt.ylabel('MAE')
plt.legend()
# RMSE Plot
plt.subplot(1, 3, 2)
sns.barplot(x=list(models.keys()), y=validation_rmse, color='blue', alpha=0.6, label='Validation RMSE')
sns.barplot(x=list(models.keys()), y=test_rmse, color='red', alpha=0.6, label='Test RMSE')
plt.title('Root Mean Squared Error (RMSE)')
plt.xlabel('Model')
plt.ylabel('RMSE')
plt.legend()
# R2 Plot
plt.subplot(1, 3, 3)
sns.barplot(x=list(models.keys()), y=validation_r2, color='blue', alpha=0.6, label='Validation R2')
sns.barplot(x=list(models.keys()), y=test_r2, color='red', alpha=0.6, label='Test R2')
plt.title('R-Squared (R2)')
plt.xlabel('Model')
plt.ylabel('R2')
plt.legend()
plt.tight_layout()
plt.show()
Training RandomForest... RandomForest - Validation MAE: 2683.6743, RMSE: 4771.0900, R2: 0.8384 RandomForest - Test MAE: 2859.3420, RMSE: 4928.6664, R2: 0.8434 Training GradientBoosting... GradientBoosting - Validation MAE: 2515.6212, RMSE: 4473.6187, R2: 0.8579 GradientBoosting - Test MAE: 2702.3967, RMSE: 4738.2516, R2: 0.8553 Training XGBoost... XGBoost - Validation MAE: 2843.4263, RMSE: 5036.5911, R2: 0.8199 XGBoost - Test MAE: 3104.7768, RMSE: 5193.3021, R2: 0.8261
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load dataset
file_path = '/content/insurance.csv'
df = pd.read_csv(file_path)
# Display basic information
print("Total number of records:", df.shape[0])
print("Column headers:", df.columns.tolist())
# Data Preprocessing
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())
# Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
# Define features (X) and target variable (y)
X = df.drop('charges', axis=1)
y = df['charges']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize models
models = {
'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}
# Dictionary to store evaluation metrics
metrics = {}
# Train and evaluate each model
for model_name, model in models.items():
print(f"\nTraining and evaluating {model_name}...")
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
# Evaluate the model
def evaluate_model(y_true, y_pred):
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
return mae, rmse, r2
train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_pred_train)
test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_pred_test)
# Store metrics
metrics[model_name] = {
'Train MAE': train_mae,
'Train RMSE': train_rmse,
'Train R²': train_r2,
'Test MAE': test_mae,
'Test RMSE': test_rmse,
'Test R²': test_r2
}
print(f"Training MAE: {train_mae:.2f}")
print(f"Training RMSE: {train_rmse:.2f}")
print(f"Training R²: {train_r2:.2f}")
print(f"Testing MAE: {test_mae:.2f}")
print(f"Testing RMSE: {test_rmse:.2f}")
print(f"Testing R²: {test_r2:.2f}")
# Convert metrics to DataFrame for better visualization
metrics_df = pd.DataFrame(metrics).T
# Plot metrics
plt.figure(figsize=(14, 7))
# Plot MAE
plt.subplot(1, 3, 1)
metrics_df[['Train MAE', 'Test MAE']].plot(kind='bar', ax=plt.gca(), color=['blue', 'orange'])
plt.title('Mean Absolute Error')
plt.ylabel('MAE')
# Plot RMSE
plt.subplot(1, 3, 2)
metrics_df[['Train RMSE', 'Test RMSE']].plot(kind='bar', ax=plt.gca(), color=['blue', 'orange'])
plt.title('Root Mean Squared Error')
plt.ylabel('RMSE')
# Plot R²
plt.subplot(1, 3, 3)
metrics_df[['Train R²', 'Test R²']].plot(kind='bar', ax=plt.gca(), color=['blue', 'orange'])
plt.title('R-Squared')
plt.ylabel('R²')
plt.tight_layout()
plt.show()
# Save the best-performing model
best_model_name = metrics_df['Test R²'].idxmax()
best_model = models[best_model_name]
joblib.dump(best_model, 'best_model.pkl')
print(f"Best model saved: {best_model_name}")
# Load the best model for making predictions
loaded_model = joblib.load('best_model.pkl')
# Make predictions with the best model
y_pred = loaded_model.predict(X_test)
# Display predictions and actual values
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results.head())
# Optional: Save results to a CSV file
results.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")
Total number of records: 1338 Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'] Missing values in each column: age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64 Training and evaluating Random Forest... Training MAE: 1084.26 Training RMSE: 1865.34 Training R²: 0.98 Testing MAE: 2667.15 Testing RMSE: 4657.03 Testing R²: 0.85 Training and evaluating Gradient Boosting... Training MAE: 2063.67 Training RMSE: 3748.94 Training R²: 0.90 Testing MAE: 2489.00 Testing RMSE: 4435.72 Testing R²: 0.87 Training and evaluating XGBoost... Training MAE: 422.51 Training RMSE: 705.36 Training R²: 1.00 Testing MAE: 2815.97 Testing RMSE: 4908.25 Testing R²: 0.84
Best model saved: Gradient Boosting
Actual Predicted
764 9095.06825 10490.274486
887 5272.17580 5626.705407
890 29330.98315 29650.343172
1293 9301.89355 9965.909995
259 33750.29180 33946.957800
Predictions saved to predictions.csv
pip install nbconvert
Requirement already satisfied: nbconvert in /usr/local/lib/python3.10/dist-packages (6.5.4) Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.9.4) Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.12.3) Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert) (6.1.0) Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.7.1) Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.4) Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (3.1.4) Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.2) Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.3.0) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.1.5) Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.8.4) Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.10.0) Requirement already satisfied: nbformat>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.10.4) Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert) (24.1) Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.5.1) Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.16.1) Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.3.0) Requirement already satisfied: traitlets>=5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.1) Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.7->nbconvert) (4.2.2) Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.10/dist-packages (from nbclient>=0.5.0->nbconvert) (6.1.12) Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (2.20.0) Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (4.23.0) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert) (2.5) Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (1.16.0) Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (0.5.1) Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (24.2.0) Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (2023.12.1) Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.35.1) Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.20.0) Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (24.0.1) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.8.2) Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.3.3)
import nbformat
from nbconvert import HTMLExporter
# File paths
notebook_file = 'Maheshinsurance.ipynb'
html_file = 'Maheshinsurance.html'
# Load the notebook
with open(notebook_file, 'r', encoding='utf-8') as file:
notebook_content = nbformat.read(file, as_version=4)
# Convert to HTML
html_exporter = HTMLExporter()
html_body, _ = html_exporter.from_notebook_node(notebook_content)
# Write HTML to file
with open(html_file, 'w', encoding='utf-8') as file:
file.write(html_body)
print(f'HTML file has been created: {html_file}')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-64-df314ac7461c> in <cell line: 9>() 7 8 # Load the notebook ----> 9 with open(notebook_file, 'r', encoding='utf-8') as file: 10 notebook_content = nbformat.read(file, as_version=4) 11 FileNotFoundError: [Errno 2] No such file or directory: 'Maheshinsurance.ipynb'